dat %>%
ggplot(aes(year, lakeid, fill=is.na(daynum))) +
geom_tile(color="grey") +
theme_bw() +
facet_wrap(~metric, nrow=6)
Remaining data issues:
Predict DOC daynum from other variable daynum’s in each northern lake
no_dups = dat %>% group_by(lakeid, year, metric) %>% summarise(N = n()) %>% filter(N == 1)
## `summarise()` has grouped output by 'lakeid', 'year'. You can override using
## the `.groups` argument.
n_lakes_wide = left_join(no_dups, dat) %>%
select(-sampledate) %>%
pivot_wider(names_from = metric, values_from = daynum) %>%
filter(lakeid %in% c("AL", "BM", "CB", "CR", "SP", "TB", "TR"))
## Joining, by = c("lakeid", "year", "metric")
hold_data = na.omit(data.frame(n_lakes_wide %>% ungroup() %>% select(-year, -N))) # , -doc_predict
all = lm(doc_epiMax~(iceon+straton+stratoff+energy+
stability+anoxia_summer+
secchi_openwater_max)*lakeid, data=hold_data)
i_o = lm(doc_epiMax~1, data=hold_data)
hold_step = step(i_o, scope=formula(all))
## Start: AIC=1761.44
## doc_epiMax ~ 1
##
## Df Sum of Sq RSS AIC
## + lakeid 6 74137 395186 1733.7
## + stratoff 1 15586 453737 1755.6
## <none> 469323 1761.4
## + secchi_openwater_max 1 1334 467989 1762.8
## + iceon 1 565 468758 1763.2
## + straton 1 551 468772 1763.2
## + anoxia_summer 1 311 469012 1763.3
## + energy 1 305 469018 1763.3
## + stability 1 65 469258 1763.4
##
## Step: AIC=1733.72
## doc_epiMax ~ lakeid
##
## Df Sum of Sq RSS AIC
## + stratoff 1 6275 388911 1732.0
## <none> 395186 1733.7
## + anoxia_summer 1 2791 392395 1734.1
## + energy 1 2484 392702 1734.3
## + iceon 1 595 394591 1735.4
## + straton 1 121 395065 1735.7
## + secchi_openwater_max 1 60 395126 1735.7
## + stability 1 56 395130 1735.7
## - lakeid 6 74137 469323 1761.4
##
## Step: AIC=1732.03
## doc_epiMax ~ lakeid + stratoff
##
## Df Sum of Sq RSS AIC
## <none> 388911 1732.0
## + energy 1 3173 385738 1732.1
## + anoxia_summer 1 3055 385856 1732.2
## - stratoff 1 6275 395186 1733.7
## + iceon 1 367 388544 1733.8
## + stability 1 232 388679 1733.9
## + straton 1 218 388693 1733.9
## + secchi_openwater_max 1 27 388884 1734.0
## + stratoff:lakeid 6 10379 378532 1737.8
## - lakeid 6 64826 453737 1755.6
doc_model = lm(doc_epiMax~(iceon+straton+stratoff+energy+
stability+anoxia_summer+
secchi_openwater_max)*lakeid,
data=n_lakes_wide,
na.action = na.exclude)
summary(doc_model)
##
## Call:
## lm(formula = doc_epiMax ~ (iceon + straton + stratoff + energy +
## stability + anoxia_summer + secchi_openwater_max) * lakeid,
## data = n_lakes_wide, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -150.321 -17.754 2.572 22.905 119.066
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.565e+01 1.691e+02 0.388 0.6983
## iceon 2.499e-01 3.647e-01 0.685 0.4940
## straton 1.685e-01 3.627e-01 0.465 0.6428
## stratoff 5.087e-01 3.117e-01 1.632 0.1044
## energy -1.686e-01 2.244e-01 -0.751 0.4535
## stability -1.210e-01 1.494e-01 -0.810 0.4192
## anoxia_summer -7.801e-02 1.775e-01 -0.440 0.6607
## secchi_openwater_max -8.199e-02 8.251e-02 -0.994 0.3217
## lakeid.L 2.877e+02 4.488e+02 0.641 0.5223
## lakeid.Q 3.595e+02 4.370e+02 0.823 0.4118
## lakeid.C 2.718e+02 4.509e+02 0.603 0.5474
## lakeid^4 1.705e+02 4.573e+02 0.373 0.7097
## lakeid^5 -3.865e+02 4.530e+02 -0.853 0.3948
## lakeid^6 3.226e+02 4.371e+02 0.738 0.4615
## iceon:lakeid.L -1.070e+00 9.624e-01 -1.112 0.2676
## iceon:lakeid.Q -5.617e-01 9.511e-01 -0.591 0.5555
## iceon:lakeid.C -4.825e-01 9.578e-01 -0.504 0.6151
## iceon:lakeid^4 5.985e-02 1.000e+00 0.060 0.9524
## iceon:lakeid^5 3.784e-01 9.531e-01 0.397 0.6918
## iceon:lakeid^6 -1.010e-01 9.630e-01 -0.105 0.9166
## straton:lakeid.L -9.552e-01 9.708e-01 -0.984 0.3265
## straton:lakeid.Q 1.588e-01 8.818e-01 0.180 0.8573
## straton:lakeid.C 1.778e-01 9.835e-01 0.181 0.8567
## straton:lakeid^4 6.986e-01 1.034e+00 0.676 0.5001
## straton:lakeid^5 1.452e+00 9.960e-01 1.458 0.1467
## straton:lakeid^6 7.466e-01 8.815e-01 0.847 0.3981
## stratoff:lakeid.L -7.548e-02 8.350e-01 -0.090 0.9281
## stratoff:lakeid.Q -3.476e-01 7.480e-01 -0.465 0.6427
## stratoff:lakeid.C -1.518e-02 8.381e-01 -0.018 0.9856
## stratoff:lakeid^4 -5.921e-01 9.186e-01 -0.645 0.5200
## stratoff:lakeid^5 8.062e-01 8.412e-01 0.958 0.3391
## stratoff:lakeid^6 -1.532e+00 7.542e-01 -2.032 0.0436 *
## energy:lakeid.L 3.454e-01 5.974e-01 0.578 0.5639
## energy:lakeid.Q -4.228e-01 5.525e-01 -0.765 0.4452
## energy:lakeid.C -1.824e-01 5.978e-01 -0.305 0.7606
## energy:lakeid^4 -9.815e-02 6.504e-01 -0.151 0.8802
## energy:lakeid^5 -4.342e-01 5.982e-01 -0.726 0.4689
## energy:lakeid^6 -5.155e-02 5.604e-01 -0.092 0.9268
## stability:lakeid.L -9.498e-02 3.465e-01 -0.274 0.7843
## stability:lakeid.Q 1.516e-01 3.811e-01 0.398 0.6913
## stability:lakeid.C -8.111e-02 3.990e-01 -0.203 0.8392
## stability:lakeid^4 -3.708e-01 3.482e-01 -1.065 0.2883
## stability:lakeid^5 -1.637e-01 4.454e-01 -0.368 0.7137
## stability:lakeid^6 -1.004e-01 4.403e-01 -0.228 0.8198
## anoxia_summer:lakeid.L 2.287e-01 4.741e-01 0.482 0.6302
## anoxia_summer:lakeid.Q 1.894e-01 4.752e-01 0.399 0.6907
## anoxia_summer:lakeid.C -9.741e-03 4.518e-01 -0.022 0.9828
## anoxia_summer:lakeid^4 -1.033e-01 5.073e-01 -0.204 0.8389
## anoxia_summer:lakeid^5 -5.992e-02 4.282e-01 -0.140 0.8889
## anoxia_summer:lakeid^6 9.392e-02 4.767e-01 0.197 0.8440
## secchi_openwater_max:lakeid.L 5.057e-01 2.347e-01 2.155 0.0325 *
## secchi_openwater_max:lakeid.Q -4.229e-01 2.348e-01 -1.801 0.0733 .
## secchi_openwater_max:lakeid.C -1.705e-01 2.244e-01 -0.760 0.4483
## secchi_openwater_max:lakeid^4 -9.056e-02 1.956e-01 -0.463 0.6438
## secchi_openwater_max:lakeid^5 -1.488e-01 2.137e-01 -0.697 0.4870
## secchi_openwater_max:lakeid^6 1.677e-01 2.037e-01 0.823 0.4115
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 42.93 on 180 degrees of freedom
## (37 observations deleted due to missingness)
## Multiple R-squared: 0.2949, Adjusted R-squared: 0.07939
## F-statistic: 1.368 on 55 and 180 DF, p-value: 0.06508
n_lakes_wide$doc_predict = predict(doc_model, n_lakes_wide)
cor = n_lakes_wide %>%
group_by(lakeid) %>%
summarise(r = paste("r =", round(cor(doc_epiMax, doc_predict, use="complete.obs"), 3)))
n_lakes_wide %>%
ggplot(aes(x=doc_epiMax, y=doc_predict, color=as.character(lakeid))) +
geom_point()+
theme_bw() +
labs(color="lakeid")+
geom_abline(slope=1, intercept = 0) +
facet_wrap(~lakeid) +
geom_text(aes(label=r), data=cor, x=300, y=0) +
labs(x="obs peak DOC (daynum)", y="modeled peak DOC (daynum)")
## Warning: Removed 37 rows containing missing values (`geom_point()`).
# not good but vaguely positive relationship; use this instead of median?
missing_doc = n_lakes_wide %>% filter(is.na(doc_epiMax))
missing_doc$doc_fill = round(predict(doc_model, missing_doc))
doc_fill = missing_doc %>%
select(lakeid, year, doc_fill) %>%
rename(daynum_fill = doc_fill) %>%
mutate(metric = "doc_epiMax") %>%
filter(year < 2000)
all_fill = bind_rows(doc_fill)
dat_filled = full_join(dat, all_fill) %>%
mutate(filled_flag = ifelse(is.na(daynum) & !is.na(daynum_fill), TRUE, FALSE)) %>%
mutate(daynum_fill = ifelse(is.na(daynum) & !is.na(daynum_fill), daynum_fill, daynum))
## Joining, by = c("lakeid", "metric", "year")
dat_filled$lakeid = factor(dat_filled$lakeid,
levels = c("AL", "BM", "CB", "CR", "SP", "TB", "TR", "FI", "ME", "MO", "WI"),
ordered = T)
vars = c("iceoff", "straton", "secchi_all", "secchi_openwater_max","secchi_openwater_min", "zoopDensity","zoopDensity_CC",
"doc_epiMax","totpuf_epiMax", "totpuf_epiMin", "totpuf_hypoMax", "totpuf_hypoMin",
"anoxia", "anoxia_summer", "stability", "energy", "stratoff", "iceon")
dat_filled$metric = factor(dat_filled$metric,
levels = rev(vars),
ordered=T)
dat_filled %>%
ggplot(aes(year, metric, fill=filled_flag)) +
geom_tile(color="grey") +
theme_bw() +
facet_wrap(~lakeid, nrow=6)
## Additional trimming/filling
Trim to “good” years, fill FI ice data with Monona, then fill additional missing with median values for each lake/metric
df_yearsWant = dat_filled %>%
filter((lakeid %in% c("FI", "ME", "MO", "WI") & year %in% 1996:2018) |
(!(lakeid %in% c("FI", "ME", "MO", "WI")) & year %in% 1982:2018))
fi_icefill = df_yearsWant %>%
filter(lakeid == "MO" & metric %in% c("iceoff", "iceon")) %>%
mutate(lakeid = "FI") %>%
select(-daynum, -filled_flag, -sampledate) %>%
rename(icefill = daynum_fill)
df_yearsWant = df_yearsWant %>%
full_join(fi_icefill) %>%
mutate(daynum_fill = ifelse(is.na(daynum_fill) & !is.na(icefill),
icefill, daynum),
filled_flag = ifelse(is.na(daynum) & !is.na(daynum_fill) & !is.na(icefill),
TRUE, filled_flag))
## Joining, by = c("lakeid", "metric", "year")
df_yearsWant %>%
ggplot(aes(year, metric, fill=filled_flag)) +
geom_tile(color="grey") +
theme_bw() +
facet_wrap(~lakeid, nrow=6)
all_lys = df_yearsWant %>%
select(lakeid, year) %>%
distinct()
metric = unique(df_yearsWant$metric)
all_lys_want = expand_grid(all_lys, metric)
dat_final = left_join(all_lys_want, df_yearsWant)
## Joining, by = c("lakeid", "year", "metric")
medians = dat_final %>%
group_by(lakeid, metric) %>%
summarise(median_daynum = median(daynum_fill, na.rm=T))
## `summarise()` has grouped output by 'lakeid'. You can override using the
## `.groups` argument.
dat_final = dat_final %>%
left_join(medians) %>%
mutate(daynum_fill = ifelse(is.na(daynum_fill), median_daynum, daynum_fill)) %>%
mutate(filled_flag = ifelse(is.na(daynum) & !is.na(daynum_fill), TRUE, filled_flag)) %>%
select(-icefill, -median_daynum)
## Joining, by = c("lakeid", "metric")
dat_final %>%
ggplot(aes(year, metric, fill=filled_flag)) +
geom_tile(color="grey") +
theme_bw() +
facet_wrap(~lakeid, nrow=6)
write_csv(dat_final, "Data/analysis_ready/final_combined_dates_filled_v2.csv")